Supplementary Figures for the paper have been generated with scripts similar to these, and some manual post-processing.

require(CNAqc)
require(mobster)
require(VIBER)
require(dplyr)

source('auxiliary_functions.R', verbose = FALSE)

Set06 plots

Load the data required for the plots.

# Set6 - prepare data as in the analysis vignette
Set6_samples = paste0('Set6_', c(42, 44, 45:48))
Set6_purity = pio:::nmfy(Set6_samples, c(0.66, 0.72, 0.80, 0.80, 0.80, 0.80))
Set6_mutations = read_csv('./data/Set6_mutations.csv')
#> Parsed with column specification:
#> cols(
#>   .default = col_double(),
#>   id = col_character(),
#>   chr = col_character(),
#>   alt = col_character(),
#>   cosmic = col_logical(),
#>   function. = col_character(),
#>   gene = col_character(),
#>   mutlocation = col_character(),
#>   ref = col_character(),
#>   region = col_character(),
#>   vartype = col_character()
#> )
#> See spec(...) for full column specifications.
Set6_CNA = read_csv('./data/Set6_cna.csv')
#> Parsed with column specification:
#> cols(
#>   chr = col_character(),
#>   from = col_double(),
#>   length = col_double(),
#>   to = col_double(),
#>   Set6_42_Minor = col_double(),
#>   Set6_42_Major = col_double(),
#>   Set6_44_Minor = col_double(),
#>   Set6_44_Major = col_double(),
#>   Set6_45_Minor = col_double(),
#>   Set6_45_Major = col_double(),
#>   Set6_46_Minor = col_double(),
#>   Set6_46_Major = col_double(),
#>   Set6_47_Minor = col_double(),
#>   Set6_47_Major = col_double(),
#>   Set6_48_Minor = col_double(),
#>   Set6_48_Major = col_double()
#> )

# CNA mapping
Set6_CNAqc_objects = readRDS("./Set6/Set6_mapped_calls.rds")

# Fits
Set6_mobster_fits = readRDS("./Set6/Set6_mobster_fits.rds")
Set6_mobster_viber_fit = readRDS("./Set6/Set6_mobster_viber_fit.rds")
Set6_mobster_viber_fit_heuristics = readRDS("./Set6/Set6_mobster_viber_fit_heuristics.rds")
Set6_standard_fit = readRDS("./Set6/Set6_standard_fit.rds")
Set6_standard_fit_heuristics = readRDS("./Set6/Set6_standard_fit_heuristics.rds")

Copy Number plots (circular).

plot_cna = lapply(Set6_samples, 
       function(x) 
         CNAqc::plot_segments(
           Set6_CNAqc_objects[[x]], 
           circular = TRUE) +
           labs(title = paste0(x, " CNA segments (all)"))
       )

ggpubr::ggarrange(plotlist = plot_cna)

Mutation data (VAF and coverage).

muts = Reduce(bind_rows, 
              lapply(
                Set6_mobster_fits %>% names, 
                function(x) Set6_mobster_fits[[x]]$data %>% mutate(sample = x))
              )

p1 = ggplot(muts, aes(VAF, fill = sample)) +
  geom_histogram(binwidth = 0.01) +
  xlim(0, 1) +
  facet_wrap( ~ sample, nrow = 1) +
  mobster:::my_ggplot_theme() +
  ggthemes::scale_fill_ptol() +
  labs(title = "VAF diploid mutations")

p2 = ggplot(muts, aes(DP, fill = sample)) +
  geom_histogram(bins = 100) +
  scale_x_log10() +
  facet_wrap( ~ sample, nrow = 1) +
  mobster:::my_ggplot_theme() +
  ggthemes::scale_fill_ptol() +
  labs(title = "Coverage diploid mutations")

p3 = ggplot(muts, aes(x = DP, y = VAF, color = sample)) +
  geom_point(size = .3, alpha = .5) +
  scale_x_log10() +
  facet_wrap( ~ sample, nrow = 1) +
  mobster:::my_ggplot_theme() +
  ggthemes::scale_color_ptol() +
  labs(title = "VAF vs coverage diploid mutations") 

ggpubr::ggarrange(p1, p2, p3, nrow = 3, ncol = 1)

Clusters

Plot of clusters’ points stats, which we use to identify clusters of potential interests.

In left, the proportion of points per cluster that have VAF > 0 in a number of biopsies, which is used to understand which cluster is composed of mostly private mutations. In right, mixing proportions of the fit; note that the actual tumour proportion with respect to the overall mutational burden should be computed including (as normalisation constant), tail mutations here removed by MOBSTER.

The first line shows the plot for the fit before the heuristic, the second the fit after.

ggpubr::ggarrange(
  plot_sample_occurrences(Set6_mobster_viber_fit),
  VIBER::plot_mixing_proportions(Set6_mobster_viber_fit),
  plot_sample_occurrences(Set6_mobster_viber_fit_heuristics),
  VIBER::plot_mixing_proportions(Set6_mobster_viber_fit_heuristics),
  nrow = 2, 
  ncol = 2)

Clusters mapping

Mapping of clusters obtained from the analysis with MOBSTER, and without. Tail mutations removed by MOBSTER are flagged as “Missing”.

Mapping before the heuristic.

plot_mapping(Set6_mutations, Set6_mobster_fits, Set6_mobster_viber_fit, Set6_standard_fit)

Mapping after the heuristic.

plot_mapping(Set6_mutations, Set6_mobster_fits, Set6_mobster_viber_fit_heuristics, Set6_standard_fit_heuristics)

Set07 plots

As for Set06, we Load the data required for the plots.

Set7_samples = paste0('Set7_', c(55, 57, 59, 62))
Set7_purity = pio:::nmfy(Set7_samples, c(.88, .88, .88, .8))
Set7_mutations = readr::read_csv("./data/Set7_mutations.csv", col_types = readr::cols())
Set7_CNA = readr::read_csv("./data/Set7_cna.csv", col_types = readr::cols())

# CNA mapping
Set7_CNAqc_objects = readRDS("./Set7/Set7_mapped_calls.rds")

# Fits
Set7_mobster_fits = readRDS("./Set7/Set7_mobster_fits.rds")
Set7_mobster_viber_fit = readRDS("./Set7/Set7_mobster_viber_fit.rds")
Set7_mobster_viber_fit_heuristics = readRDS("./Set7/Set7_mobster_viber_fit_heuristics.rds")
Set7_standard_fit = readRDS("./Set7/Set7_standard_fit.rds")
Set7_standard_fit_heuristics = readRDS("./Set7/Set7_standard_fit_heuristics.rds")

Copy Number plots (circular).

plot_cna = lapply(Set7_samples, 
       function(x) 
         CNAqc::plot_segments(
           Set7_CNAqc_objects[[x]], 
           circular = TRUE) +
           labs(title = paste0(x, " CNA segments (all)"))
       )

ggpubr::ggarrange(plotlist = plot_cna)

Mutation data (VAF and coverage).

muts = Reduce(bind_rows, 
              lapply(
                Set7_mobster_fits %>% names, 
                function(x) Set7_mobster_fits[[x]]$data %>% mutate(sample = x))
              )

p1 = ggplot(muts, aes(VAF, fill = sample)) +
  geom_histogram(binwidth = 0.01) +
  xlim(0, 1) +
  facet_wrap( ~ sample, nrow = 1) +
  mobster:::my_ggplot_theme() +
  ggthemes::scale_fill_ptol() +
  labs(title = "VAF diploid mutations")

p2 = ggplot(muts, aes(DP, fill = sample)) +
  geom_histogram(bins = 100) +
  scale_x_log10() +
  facet_wrap( ~ sample, nrow = 1) +
  mobster:::my_ggplot_theme() +
  ggthemes::scale_fill_ptol() +
  labs(title = "Coverage diploid mutations")

p3 = ggplot(muts, aes(x = DP, y = VAF, color = sample)) +
  geom_point(size = .3, alpha = .5) +
  scale_x_log10() +
  facet_wrap( ~ sample, nrow = 1) +
  mobster:::my_ggplot_theme() +
  ggthemes::scale_color_ptol() +
  labs(title = "VAF vs coverage diploid mutations") 

ggpubr::ggarrange(p1, p2, p3, nrow = 3, ncol = 1)

Clusters

Plot of clusters’ points, as for Set06.

ggpubr::ggarrange(
  plot_sample_occurrences(Set7_mobster_viber_fit),
  VIBER::plot_mixing_proportions(Set7_mobster_viber_fit),
  plot_sample_occurrences(Set7_mobster_viber_fit_heuristics),
  VIBER::plot_mixing_proportions(Set7_mobster_viber_fit_heuristics),
  nrow = 2, 
  ncol = 2)

Clusters mapping

Before the heuristic.

plot_mapping(Set7_mutations, Set7_mobster_fits, Set7_mobster_viber_fit, Set7_standard_fit)

After the heuristic.

plot_mapping(Set7_mutations, Set7_mobster_fits, Set7_mobster_viber_fit_heuristics, Set7_standard_fit_heuristics)

dN/dS analyses

Same pipeline to have a look at tail mutations, for example.

# L ~ list of mobster fitts
pipe_dnds_tails = function(L)
{
  require(dndscv)
  
  input_dnds =
    Reduce(bind_rows,
           lapply(L %>% names,
                  function(x) {
                    L[[x]]$data %>% mutate(sampleID = paste(names(L), collapse = ', '))
                  })) %>%
    filter(cluster == 'Tail') %>%
    select(chr, from, ref, alt, cluster, sampleID) %>%
    distinct() %>%
    rename(pos = from) %>%
    select(sampleID, chr, pos, ref, alt) %>%
    mutate(pos = as.integer(pos))

  # hg19
  input_dnds$chr = gsub('chr', '', input_dnds$chr)
  
  dndscv::dndscv(input_dnds, refdb = "hg19")
}

Split patients and run

# Set6
Set6_dnds = pipe_dnds_tails(readRDS("./Set6/Set6_mobster_fits.rds"))
#> Loading required package: dndscv
#> [1] Loading the environment...
#> [2] Annotating the mutations...
#>     Note: 7 mutations removed for exceeding the limit of mutations per gene per sample (see the max_muts_per_gene_per_sample argument in dndscv)
#> [3] Estimating global rates...
#> [4] Running dNdSloc...
#> [5] Running dNdScv...
#>     Regression model for substitutions (theta = 0.0574).
print(Set6_dnds$globaldnds)
#>      name       mle     cilow    cihigh
#> wmis wmis 0.6651705 0.4838238 0.9144894
#> wnon wnon 0.5193101 0.1526613 1.7665446
#> wspl wspl 0.8030913 0.3336526 1.9330153
#> wtru wtru 0.6833264 0.3281163 1.4230774
#> wall wall 0.6678600 0.4875842 0.9147896

# Set7
Set7_dnds = pipe_dnds_tails(readRDS("./Set7/Set7_mobster_fits.rds"))
#> [1] Loading the environment...
#> [2] Annotating the mutations...
#>     Note: 1 mutations removed for exceeding the limit of mutations per gene per sample (see the max_muts_per_gene_per_sample argument in dndscv)
#> [3] Estimating global rates...
#> [4] Running dNdSloc...
#> [5] Running dNdScv...
#>     Regression model for substitutions (theta = 0.307).
print(Set7_dnds$globaldnds)
#>      name       mle     cilow   cihigh
#> wmis wmis 0.9655637 0.7587956 1.228675
#> wnon wnon 1.5777026 0.8852469 2.811809
#> wspl wspl 1.1608594 0.6289249 2.142695
#> wtru wtru 1.3563654 0.8741619 2.104561
#> wall wall 0.9905695 0.7802953 1.257509